In [2]:
import importlib
from typing import Dict

import numpy as np
from matplotlib import pyplot as plt

from shared.definitions import TuningResult
from shared.ml_config_core import ModelTrainingResult, TestTrainData
from shared import pipeline, stats_utils
from shared.ml_config_runner import build_production_model_for_tuning_result
from src import data_loader
from shared import graph
from shared import utils
import seaborn as sns
import pandas as pd

from src.utils import TargetType
In [3]:
# Apply the project's shared pandas / matplotlib settings; both helpers come
# from shared.utils and receive the library module they configure.
utils.pandas_config(pd)
utils.plt_config(plt)

# Order matters: the seaborn theme is set first and the matplotlib
# "fivethirtyeight" style sheet is applied on top of it afterwards.
sns.set_theme(style="darkgrid", palette="pastel")
plt.style.use("fivethirtyeight")
In [4]:
# Model key kept as a notebook-level constant.
# NOTE(review): the same key is commented out of INCLUDE_MODELS below, so the
# loops that iterate INCLUDE_MODELS do not train it — confirm it is still needed.
SELECTED_MODEL = "XGBoostOrdinalRegressor_Default"
# Keys of the serialized tuning results that the training loops load and fit;
# comment entries in or out to choose which models are built.
INCLUDE_MODELS = [
    # "XGBoostOrdinalRegressor_Default",
    # "XGBoostMulticlassTunePRAUC",
    # "XGBoostMulticlassTuneLogLoss",
    "XGBoostF1Multiclass",
]
# When truthy, cells print / display extra diagnostics (sample counts, the
# pipeline object, feature-importance tables).
VERBOSE = True
In [5]:
# Reload so local edits to src.data_loader are picked up without a kernel restart.
importlib.reload(data_loader)

# Small sample with `grade` as the ordinal multiclass target.
transformed_data = data_loader.load_processed_dataset(
    target_col="grade",
    target_type=TargetType.MulticlassOrdinal,
    drop_cols=["loan_status", "sub_grade", "int_rate"],
    sample_size=data_loader.SampleSize.Small,
)

# Second copy of the dataset in which target codes 4, 5 and 6 are collapsed
# into the single code 4 (presumably grades E, F and G — confirm the encoding
# against data_loader). `transformed_data` itself is left untouched.
transformed_data_simplified_grade = transformed_data.assign(
    target__grade=np.where(
        transformed_data["target__grade"].isin([4, 5, 6]),
        4,
        transformed_data["target__grade"],
    )
)

if VERBOSE:
    print(f"Total samples loaded : {len(transformed_data)}")
Dropping cols where: grade is missing 90000 -> 89996
With transform="pandas", `func` should return a DataFrame to follow the set_output API.
Total samples loaded : 89996
In [ ]:
# Training results for the `grade` target, keyed by model key.
cv_results_all_models: Dict[str, ModelTrainingResult] = {}

for model_key in INCLUDE_MODELS:
    # Load the stored tuning result for this model key.
    tuning_result = TuningResult.load_serialized_tuning_result(model_key)

    # Build / evaluate the model from that tuning result on the grade dataset.
    cv_results = build_production_model_for_tuning_result(
        tuning_result=tuning_result, df=transformed_data
    )
    cv_results_all_models[model_key] = cv_results

    # NOTE(review): the merged-grade and sub-grade training loops further down
    # serialize under this same bare model_key. If serialize_model keys its
    # output on that name, later loops overwrite this model — confirm.
    ModelTrainingResult.serialize_model(cv_results, model_key)
In [28]:
# Show the pipeline object stored as `test_model` on the test data of the
# XGBoostF1Multiclass result (requires that key to be in INCLUDE_MODELS).
if VERBOSE:
    display(cv_results_all_models["XGBoostF1Multiclass"].test_data.test_model)
Pipeline(steps=[('preprocessing',
                 FunctionTransformer(func=<function convert_to_category at 0x7f1aa69abd00>)),
                ('feat_trans_loan_grade', LoanGradeTransformer()),
                ('feat_trans_dti_inc_joint', JointApplicationTransformer()),
                ('feat_trans_fico_score', FICOScoreTransformer()),
                ('feat_trans_delinquency', DelinquencyTransformer()),
                ('feat_trans_inst_income_ratio', Ins...
                               feature_types=None, gamma=0.1, grow_policy=None,
                               importance_type=None,
                               interaction_constraints=None, learning_rate=0.3,
                               max_bin=None, max_cat_threshold=None,
                               max_cat_to_onehot=None, max_delta_step=None,
                               max_depth=7, max_leaves=None,
                               min_child_weight=2.5, missing=nan,
                               monotone_constraints=None, multi_strategy=None,
                               n_estimators=150, n_jobs=None,
                               num_parallel_tree=None,
                               objective='multi:softprob', ...))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('preprocessing',
                 FunctionTransformer(func=<function convert_to_category at 0x7f1aa69abd00>)),
                ('feat_trans_loan_grade', LoanGradeTransformer()),
                ('feat_trans_dti_inc_joint', JointApplicationTransformer()),
                ('feat_trans_fico_score', FICOScoreTransformer()),
                ('feat_trans_delinquency', DelinquencyTransformer()),
                ('feat_trans_inst_income_ratio', Ins...
                               feature_types=None, gamma=0.1, grow_policy=None,
                               importance_type=None,
                               interaction_constraints=None, learning_rate=0.3,
                               max_bin=None, max_cat_threshold=None,
                               max_cat_to_onehot=None, max_delta_step=None,
                               max_depth=7, max_leaves=None,
                               min_child_weight=2.5, missing=nan,
                               monotone_constraints=None, multi_strategy=None,
                               n_estimators=150, n_jobs=None,
                               num_parallel_tree=None,
                               objective='multi:softprob', ...))])
FunctionTransformer(func=<function convert_to_category at 0x7f1aa69abd00>)
LoanGradeTransformer()
JointApplicationTransformer()
FICOScoreTransformer()
DelinquencyTransformer()
InstallementIncomeRatio()
NewDtiTransformer()
DummyDropAllButFICOHigh(option=<Options.OFF: 0>)
FunctionTransformer(func=<function get_pipeline.<locals>.remove_columns_with_prefix at 0x7f1aa4fc6e60>,
                    kw_args={'prefix': 'target__'})
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device='cpu', early_stopping_rounds=None,
              enable_categorical=True, eval_metric=None, feature_types=None,
              gamma=0.1, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.3, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=7, max_leaves=None,
              min_child_weight=2.5, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=150, n_jobs=None,
              num_parallel_tree=None, objective='multi:softprob', ...)
In [29]:
def plot_regressor_actual_vs_predicted(data: TestTrainData, model_name: str):
    """
    Scatter a model's predictions against the true targets and overlay a dashed
    red identity line (where perfect predictions would fall).

    Parameters:
    - data: TestTrainData instance; its `y_test` and `predictions` are plotted.
      When `predictions` is a DataFrame only its first column is used.
    - model_name: label placed at the start of the figure title.
    """
    predicted = data.predictions
    if isinstance(predicted, pd.DataFrame):
        predicted = predicted.iloc[:, 0]

    points = pd.DataFrame({"Actual": data.y_test, "Predicted": predicted})

    # Cap the number of plotted points; the fixed seed keeps the subset stable.
    if len(points) > 10000:
        points = points.sample(10000, random_state=42)

    fig, ax = plt.subplots(figsize=(10, 8))
    sns.scatterplot(
        data=points, x="Actual", y="Predicted", alpha=0.3, edgecolor=None, ax=ax
    )

    # The identity line spans the combined range of both columns.
    lower = min(points["Actual"].min(), points["Predicted"].min())
    upper = max(points["Actual"].max(), points["Predicted"].max())
    ax.plot([lower, upper], [lower, upper], color="red", lw=2, linestyle="--")

    ax.set_title(f"{model_name} Actual vs. Predicted Values")
    ax.set_xlabel("Actual Value")
    ax.set_ylabel("Predicted Value")

    fig.tight_layout()
    plt.show()
In [26]:
# Draw the actual-vs-predicted scatter only for models whose key contains
# "Regressor"; every other model is skipped.
for k, model_data in cv_results_all_models.items():
    if "Regressor" not in k:
        continue
    test_data = model_data.test_data
    plot_regressor_actual_vs_predicted(test_data, k)
No description has been provided for this image
In [30]:
# Hex-bin joint plot of actual vs. predicted values, drawn only for models
# whose key contains "Regressor".
for k, model_data in cv_results_all_models.items():
    if "Regressor" not in k:
        continue

    test_data = model_data.test_data

    # sns.jointplot creates its own figure, so no plt.figure() call precedes
    # it: a pre-created figure stays empty and is rendered as
    # "<Figure size ... with 0 Axes>". Pass `height=` to jointplot to resize.
    joint_plot = sns.jointplot(
        x=test_data.y_test, y=test_data.predictions, kind="hex", color="#4CB391"
    )
    # `.figure` is the current accessor; `.fig` is deprecated in seaborn.
    joint_plot.figure.suptitle(f"{k}", fontsize=16, y=1.03)
    joint_plot.set_axis_labels("Actual Grade", "Predicted", fontsize=14)

    plt.show()
<Figure size 1100x900 with 0 Axes>
No description has been provided for this image
In [31]:
# One violin plot per model: predictions grouped by the actual target value.
for k, model_data in cv_results_all_models.items():
    test_data = model_data.test_data

    # Create the figure's single axes explicitly and hand it to seaborn rather
    # than relying on pyplot's implicit "current axes".
    ax = plt.figure(figsize=(10, 8)).add_subplot()
    sns.violinplot(
        x=test_data.y_test,
        y=test_data.predictions,
        inner="quart",
        fill=False,
        ax=ax,
    )
    ax.set_title(f"{k}")
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [32]:
# Fetch the test-set predictions stored for the ordinal regressor.
# NOTE(review): this key is commented out of INCLUDE_MODELS, so on a fresh
# kernel cv_results_all_models presumably does not contain it and this lookup
# raises KeyError — confirm whether the regressor should be re-enabled.
predictions_rounded = cv_results_all_models[
    "XGBoostOrdinalRegressor_Default"
].test_data.predictions
In [33]:
# Round the predictions in place of the raw values (same variable name).
# NOTE(review): predictions_rounded is not read by any later cell visible here.
predictions_rounded = predictions_rounded.round()
In [10]:
# Render matplotlib figures inline in the cell output.
%matplotlib inline
# Re-import shared.graph so edits to the plotting helpers take effect without
# restarting the kernel.
importlib.reload(graph)


def render_multiclass_confusion_matrices(all_models):
    """
    Draw one confusion matrix per model on a shared two-column grid.

    Parameters:
    - all_models: mapping of model key -> training result. Each value's
      `test_data` is passed to `graph.confusion_matrix_plot_v2`, and the key is
      used as the panel title.

    Returns None. Nothing is drawn when `all_models` is empty.
    """
    n = len(all_models)
    if n == 0:
        # A 0-row grid cannot be created, and there is nothing to show anyway.
        return

    columns = 2
    rows = -(-n // columns)  # ceiling division, correct for any column count
    height = 18
    width = height * columns

    # squeeze=False keeps `axes` two-dimensional for every grid shape.
    fig, axes = plt.subplots(
        rows,
        columns,
        figsize=(width, height * rows),
        constrained_layout=True,
        squeeze=False,
    )
    fig.suptitle("Confusion Matrices: Best Models based on f1", fontsize=20)

    axes_flat = axes.flatten()
    for ax, (model_key, model_result) in zip(axes_flat, all_models.items()):
        # NOTE(review): regressor_input=True is passed for every model,
        # whatever its key — confirm against graph.confusion_matrix_plot_v2.
        graph.confusion_matrix_plot_v2(
            model_result.test_data,
            title=model_key,
            ax=ax,
            regressor_input=True,
        )

    # Hide grid cells that did not receive a model (odd number of models).
    for unused_ax in axes_flat[n:]:
        unused_ax.axis("off")

    plt.show()
In [26]:
# Re-import shared.stats_utils so edits to it take effect without restarting
# the kernel.
importlib.reload(stats_utils)


def render_importance_charts(data, all_models):
    """
    Render a feature-importance chart for every model in `all_models`.

    Parameters:
    - data: dataset forwarded unchanged to
      `stats_utils.get_model_feature_importances` for each model.
    - all_models: mapping of model key -> model result object.

    When the notebook-level VERBOSE flag is truthy, the importance table of
    each model is displayed after its chart.
    """
    for model_key, model_config in all_models.items():
        importances = stats_utils.get_model_feature_importances(model_config, data)

        graph.render_feature_importances_chart(
            title=f"{model_key} Importances",
            feature_importances=importances,
        )
        if VERBOSE:
            display(importances)
In [37]:
# Confusion matrices for the models trained on the original `grade` target.
render_multiclass_confusion_matrices(cv_results_all_models)
/home/paulius/miniconda3/envs/rapids_v2/lib/python3.10/site-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
No description has been provided for this image
In [41]:
# render_importance_charts takes (data, all_models); pass the dataset these
# models were trained on as the first argument.
render_importance_charts(transformed_data, cv_results_all_models)
/home/paulius/data/projects/m3_s3_lending/shared/graph.py:457: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
  ax.set_yticklabels(
No description has been provided for this image
Feature Importance
0 term_parser__term 0.200921
1 emp_length_parser__emp_length 0.000000
2 zip__zip_code 0.014028
3 pass__loan_amnt 0.019779
4 pass__installment 0.028261
5 pass__home_ownership 0.000000
6 pass__annual_inc 0.010391
7 pass__verification_status 0.052370
8 pass__purpose 0.033422
9 pass__addr_state 0.000000
10 pass__dti 0.016340
11 pass__delinq_2yrs 0.000000
12 pass__fico_range_low 0.203827
13 pass__fico_range_high 0.000000
14 pass__inq_last_6mths 0.041348
15 pass__mths_since_last_delinq 0.000000
16 pass__mths_since_last_record 0.000000
17 pass__open_acc 0.000000
18 pass__pub_rec 0.000000
19 pass__revol_bal 0.020648
20 pass__revol_util 0.038275
21 pass__total_acc 0.028199
22 pass__initial_list_status 0.046777
23 pass__last_fico_range_high 0.089089
24 pass__last_fico_range_low 0.000000
25 pass__collections_12_mths_ex_med 0.000000
26 pass__mths_since_last_major_derog 0.000000
27 pass__application_type 0.000000
28 pass__verification_status_joint 0.000000
29 pass__inq_fi 0.015884
30 pass__inq_last_12m 0.017836
31 pass__chargeoff_within_12_mths 0.000000
32 pass__mort_acc 0.016369
33 pass__pub_rec_bankruptcies 0.000000
34 pass__tax_liens 0.000000
35 pass__tot_hi_cred_lim 0.019794
36 pass__total_bal_ex_mort 0.000970
37 installment_income_ratio 0.053909
38 new_dti 0.031560
/home/paulius/data/projects/m3_s3_lending/shared/graph.py:457: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
  ax.set_yticklabels(
No description has been provided for this image
Feature Importance
0 term_parser__term 0.271462
1 emp_length_parser__emp_length 0.008289
2 zip__zip_code 0.106345
3 pass__loan_amnt 0.061711
4 pass__installment 0.046212
5 pass__home_ownership 0.008468
6 pass__annual_inc 0.012985
7 pass__verification_status 0.016868
8 pass__purpose 0.023965
9 pass__addr_state 0.015981
10 pass__dti 0.015480
11 pass__delinq_2yrs 0.011471
12 pass__fico_range_low 0.085221
13 pass__fico_range_high 0.000000
14 pass__inq_last_6mths 0.023751
15 pass__mths_since_last_delinq 0.009172
16 pass__mths_since_last_record 0.010692
17 pass__open_acc 0.008931
18 pass__pub_rec 0.009866
19 pass__revol_bal 0.010371
20 pass__revol_util 0.010980
21 pass__total_acc 0.010002
22 pass__initial_list_status 0.048485
23 pass__last_fico_range_high 0.020814
24 pass__last_fico_range_low 0.000000
25 pass__collections_12_mths_ex_med 0.013943
26 pass__mths_since_last_major_derog 0.009580
27 pass__application_type 0.013948
28 pass__verification_status_joint 0.014838
29 pass__inq_fi 0.014440
30 pass__inq_last_12m 0.014292
31 pass__chargeoff_within_12_mths 0.013176
32 pass__mort_acc 0.010901
33 pass__pub_rec_bankruptcies 0.012358
34 pass__tax_liens 0.013203
35 pass__tot_hi_cred_lim 0.012728
36 pass__total_bal_ex_mort 0.009067
/home/paulius/data/projects/m3_s3_lending/shared/graph.py:457: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
  ax.set_yticklabels(
No description has been provided for this image
Feature Importance
0 term_parser__term 0.186552
1 emp_length_parser__emp_length 0.008174
2 zip__zip_code 0.066410
3 pass__loan_amnt 0.069225
4 pass__installment 0.054875
5 pass__home_ownership 0.008708
6 pass__annual_inc 0.017367
7 pass__verification_status 0.024303
8 pass__purpose 0.027819
9 pass__addr_state 0.014011
10 pass__dti 0.019013
11 pass__delinq_2yrs 0.010187
12 pass__fico_range_low 0.124549
13 pass__fico_range_high 0.000000
14 pass__inq_last_6mths 0.032111
15 pass__mths_since_last_delinq 0.009185
16 pass__mths_since_last_record 0.010386
17 pass__open_acc 0.008975
18 pass__pub_rec 0.007683
19 pass__revol_bal 0.011625
20 pass__revol_util 0.013283
21 pass__total_acc 0.011336
22 pass__initial_list_status 0.055525
23 pass__last_fico_range_high 0.029334
24 pass__last_fico_range_low 0.000000
25 pass__collections_12_mths_ex_med 0.011566
26 pass__mths_since_last_major_derog 0.010041
27 pass__application_type 0.018512
28 pass__annual_inc_joint 0.011335
29 pass__dti_joint 0.017017
30 pass__verification_status_joint 0.013634
31 pass__inq_fi 0.017480
32 pass__inq_last_12m 0.016551
33 pass__chargeoff_within_12_mths 0.006341
34 pass__mort_acc 0.012275
35 pass__pub_rec_bankruptcies 0.012554
36 pass__tax_liens 0.008045
37 pass__tot_hi_cred_lim 0.014740
38 pass__total_bal_ex_mort 0.009274

While performance for grades A, B and C is acceptable (F1 > ~0.8), performance on the lower grades is very poor, especially for grade G, which is almost never classified correctly.

As we learnt when building our default-risk model, returns and other features vary much less between the lower-quality grades, which makes those grades hard to tell apart. We therefore use the same approach as before and merge grades E, F and G into a single group.

Merging E-F-G loan grades into a single group¶

In [42]:
# Training results for the merged-grade target, keyed by model key.
cv_results_all_models_simplified_grade: Dict[str, ModelTrainingResult] = {}

for model_key in INCLUDE_MODELS:
    # Load the stored tuning result for this model key.
    tuning_result = TuningResult.load_serialized_tuning_result(model_key)

    # Same build step as for the original grades, but fitted on the dataset
    # whose target__grade codes 4, 5 and 6 were merged into 4.
    cv_results = build_production_model_for_tuning_result(
        tuning_result=tuning_result, df=transformed_data_simplified_grade
    )
    cv_results_all_models_simplified_grade[model_key] = cv_results

    # NOTE(review): serialized under the bare model_key, the same key the
    # original-grade and sub-grade training loops use. If serialize_model keys
    # its output on that name, this overwrites the original-grade model — confirm.
    ModelTrainingResult.serialize_model(cv_results, model_key)
Training: XGBoostF1Multiclass with: {'feat_trans_delinquency__option': 0, 'feat_trans_dti_inc_joint__option': 0, 'feat_trans_dummy_DROP_ALL_BUT_FICO_HIGH__option': 0, 'feat_trans_fico_score__option': 0, 'feat_trans_inst_income_ratio__option': 0, 'feat_trans_loan_grade__option': 0, 'feat_trans_new_dti_after_loan__option': 0, 'model__n_estimators': 150, 'model__min_child_weight': 2.5, 'model__max_depth': 7, 'model__learning_rate': 0.3, 'model__gamma': 0.1}
XGBoostF1Multiclass: 60.8 seconds
In [43]:
# Confusion matrices for the models retrained on the merged-grade target.
render_multiclass_confusion_matrices(cv_results_all_models_simplified_grade)
No description has been provided for this image
In [44]:
# render_importance_charts takes (data, all_models); pass the merged-grade
# dataset these models were trained on as the first argument.
render_importance_charts(
    transformed_data_simplified_grade, cv_results_all_models_simplified_grade
)
/home/paulius/data/projects/m3_s3_lending/shared/graph.py:457: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
  ax.set_yticklabels(
No description has been provided for this image
Feature Importance
0 term_parser__term 0.200921
1 emp_length_parser__emp_length 0.000000
2 zip__zip_code 0.014028
3 pass__loan_amnt 0.019779
4 pass__installment 0.028261
5 pass__home_ownership 0.000000
6 pass__annual_inc 0.010391
7 pass__verification_status 0.052370
8 pass__purpose 0.033422
9 pass__addr_state 0.000000
10 pass__dti 0.016340
11 pass__delinq_2yrs 0.000000
12 pass__fico_range_low 0.203827
13 pass__fico_range_high 0.000000
14 pass__inq_last_6mths 0.041348
15 pass__mths_since_last_delinq 0.000000
16 pass__mths_since_last_record 0.000000
17 pass__open_acc 0.000000
18 pass__pub_rec 0.000000
19 pass__revol_bal 0.020648
20 pass__revol_util 0.038275
21 pass__total_acc 0.028199
22 pass__initial_list_status 0.046777
23 pass__last_fico_range_high 0.089089
24 pass__last_fico_range_low 0.000000
25 pass__collections_12_mths_ex_med 0.000000
26 pass__mths_since_last_major_derog 0.000000
27 pass__application_type 0.000000
28 pass__verification_status_joint 0.000000
29 pass__inq_fi 0.015884
30 pass__inq_last_12m 0.017836
31 pass__chargeoff_within_12_mths 0.000000
32 pass__mort_acc 0.016369
33 pass__pub_rec_bankruptcies 0.000000
34 pass__tax_liens 0.000000
35 pass__tot_hi_cred_lim 0.019794
36 pass__total_bal_ex_mort 0.000970
37 installment_income_ratio 0.053909
38 new_dti 0.031560
/home/paulius/data/projects/m3_s3_lending/shared/graph.py:457: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
  ax.set_yticklabels(
No description has been provided for this image
Feature Importance
0 term_parser__term 0.271462
1 emp_length_parser__emp_length 0.008289
2 zip__zip_code 0.106345
3 pass__loan_amnt 0.061711
4 pass__installment 0.046212
5 pass__home_ownership 0.008468
6 pass__annual_inc 0.012985
7 pass__verification_status 0.016868
8 pass__purpose 0.023965
9 pass__addr_state 0.015981
10 pass__dti 0.015480
11 pass__delinq_2yrs 0.011471
12 pass__fico_range_low 0.085221
13 pass__fico_range_high 0.000000
14 pass__inq_last_6mths 0.023751
15 pass__mths_since_last_delinq 0.009172
16 pass__mths_since_last_record 0.010692
17 pass__open_acc 0.008931
18 pass__pub_rec 0.009866
19 pass__revol_bal 0.010371
20 pass__revol_util 0.010980
21 pass__total_acc 0.010002
22 pass__initial_list_status 0.048485
23 pass__last_fico_range_high 0.020814
24 pass__last_fico_range_low 0.000000
25 pass__collections_12_mths_ex_med 0.013943
26 pass__mths_since_last_major_derog 0.009580
27 pass__application_type 0.013948
28 pass__verification_status_joint 0.014838
29 pass__inq_fi 0.014440
30 pass__inq_last_12m 0.014292
31 pass__chargeoff_within_12_mths 0.013176
32 pass__mort_acc 0.010901
33 pass__pub_rec_bankruptcies 0.012358
34 pass__tax_liens 0.013203
35 pass__tot_hi_cred_lim 0.012728
36 pass__total_bal_ex_mort 0.009067
/home/paulius/data/projects/m3_s3_lending/shared/graph.py:457: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
  ax.set_yticklabels(
No description has been provided for this image
Feature Importance
0 term_parser__term 0.186552
1 emp_length_parser__emp_length 0.008174
2 zip__zip_code 0.066410
3 pass__loan_amnt 0.069225
4 pass__installment 0.054875
5 pass__home_ownership 0.008708
6 pass__annual_inc 0.017367
7 pass__verification_status 0.024303
8 pass__purpose 0.027819
9 pass__addr_state 0.014011
10 pass__dti 0.019013
11 pass__delinq_2yrs 0.010187
12 pass__fico_range_low 0.124549
13 pass__fico_range_high 0.000000
14 pass__inq_last_6mths 0.032111
15 pass__mths_since_last_delinq 0.009185
16 pass__mths_since_last_record 0.010386
17 pass__open_acc 0.008975
18 pass__pub_rec 0.007683
19 pass__revol_bal 0.011625
20 pass__revol_util 0.013283
21 pass__total_acc 0.011336
22 pass__initial_list_status 0.055525
23 pass__last_fico_range_high 0.029334
24 pass__last_fico_range_low 0.000000
25 pass__collections_12_mths_ex_med 0.011566
26 pass__mths_since_last_major_derog 0.010041
27 pass__application_type 0.018512
28 pass__annual_inc_joint 0.011335
29 pass__dti_joint 0.017017
30 pass__verification_status_joint 0.013634
31 pass__inq_fi 0.017480
32 pass__inq_last_12m 0.016551
33 pass__chargeoff_within_12_mths 0.006341
34 pass__mort_acc 0.012275
35 pass__pub_rec_bankruptcies 0.012554
36 pass__tax_liens 0.008045
37 pass__tot_hi_cred_lim 0.014740
38 pass__total_bal_ex_mort 0.009274

Classifying Subgrades¶

In [6]:
# Reload so local edits to src.data_loader are picked up without a kernel restart.
importlib.reload(data_loader)
# Same SampleSize.Small setting as the grade dataset, but the target column is
# `sub_grade` (extended ordinal multiclass) and `grade` is passed in drop_cols
# together with loan_status and int_rate.
transformed_data_subgrade = data_loader.load_processed_dataset(
    sample_size=data_loader.SampleSize.Small,
    target_col="sub_grade",
    target_type=TargetType.MulticlassOrdinalExtended,
    drop_cols=["loan_status", "grade", "int_rate"],
)
if VERBOSE:
    print(f"Total samples loaded : {len(transformed_data_subgrade)}")
Dropping cols where: sub_grade is missing 90000 -> 89996
/home/paulius/miniconda3/envs/rapids_v2/lib/python3.10/site-packages/sklearn/preprocessing/_function_transformer.py:345: UserWarning: With transform="pandas", `func` should return a DataFrame to follow the set_output API.
  warnings.warn(
Total samples loaded : 89996
In [7]:
# Training results for the `sub_grade` target, keyed by model key.
cv_results_all_models_sub_grade: Dict[str, ModelTrainingResult] = {}

for model_key in INCLUDE_MODELS:
    # Load the stored tuning result for this model key (the same tuning result
    # the grade models use).
    tuning_result = TuningResult.load_serialized_tuning_result(model_key)

    # Build / evaluate the model on the sub-grade dataset.
    cv_results = build_production_model_for_tuning_result(
        tuning_result=tuning_result, df=transformed_data_subgrade
    )
    cv_results_all_models_sub_grade[model_key] = cv_results

    # NOTE(review): serialized under the bare model_key, the same key the
    # grade training loops use. If serialize_model keys its output on that
    # name, this sub-grade model replaces the stored grade model — confirm.
    ModelTrainingResult.serialize_model(cv_results, model_key)
/home/paulius/miniconda3/envs/rapids_v2/lib/python3.10/site-packages/sklearn/metrics/_classification.py:2922: UserWarning: The y_pred values do not sum to one. Starting from 1.5 thiswill result in an error.
  warnings.warn(
Training: XGBoostF1Multiclass with: {'feat_trans_delinquency__option': 0, 'feat_trans_dti_inc_joint__option': 0, 'feat_trans_dummy_DROP_ALL_BUT_FICO_HIGH__option': 0, 'feat_trans_fico_score__option': 0, 'feat_trans_inst_income_ratio__option': 0, 'feat_trans_loan_grade__option': 0, 'feat_trans_new_dti_after_loan__option': 0, 'model__n_estimators': 150, 'model__min_child_weight': 2.5, 'model__max_depth': 7, 'model__learning_rate': 0.3, 'model__gamma': 0.1}
XGBoostF1Multiclass: 293.7 seconds

Predicting Sub-grades¶

We attempted to build a model that predicts individual sub-grades (e.g. A1, A2, ..., G5) in addition to the top-level grades. An XGBoost multi-class classification model was used; however, its performance was unsatisfactory. We include the overall performance summary and feature importances below, but we decided not to provide an in-depth analysis because the model would not be useful for any practical application. A different approach would probably suit this problem better, given the high number of classes and their ordinal nature.

In [16]:
# Print each sub-grade model's key followed by the metrics stored on its test data.
# NOTE(review): the saved output shows a trailing `None` after the metrics
# dict, which suggests the output predates this code or that reading
# `metrics` itself prints — confirm.
for model_key, model_results in cv_results_all_models_sub_grade.items():
    print(model_key)
    print(model_results.test_data.metrics)
XGBoostF1Multiclass
{'f1': 0.199, 'accuracy': 0.2733, 'precision': 0.2108, 'recall': 0.1947, 'log_loss': 2}
None
In [27]:
# Feature-importance charts for the sub-grade models, using the sub-grade
# dataset as the `data` argument.
render_importance_charts(transformed_data_subgrade, cv_results_all_models_sub_grade)
/home/paulius/data/projects/m3_s3_lending/shared/graph.py:457: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
  ax.set_yticklabels(
No description has been provided for this image
Feature Importance
0 term_parser__term 0.106485
1 emp_length_parser__emp_length 0.016260
2 zip__zip_code 0.063670
3 pass__loan_amnt 0.048561
4 pass__installment 0.046285
5 pass__home_ownership 0.017024
6 pass__annual_inc 0.020558
7 pass__verification_status 0.024662
8 pass__purpose 0.026665
9 pass__addr_state 0.025046
10 pass__dti 0.021775
11 pass__delinq_2yrs 0.017929
12 pass__fico_range_low 0.065871
13 pass__fico_range_high 0.000000
14 pass__inq_last_6mths 0.027946
15 pass__mths_since_last_delinq 0.017716
16 pass__mths_since_last_record 0.019174
17 pass__open_acc 0.016289
18 pass__pub_rec 0.020212
19 pass__revol_bal 0.018316
20 pass__revol_util 0.019093
21 pass__total_acc 0.017927
22 pass__initial_list_status 0.033912
23 pass__last_fico_range_high 0.024567
24 pass__last_fico_range_low 0.000000
25 pass__collections_12_mths_ex_med 0.016248
26 pass__mths_since_last_major_derog 0.017968
27 pass__application_type 0.026973
28 pass__annual_inc_joint 0.021111
29 pass__dti_joint 0.022676
30 pass__verification_status_joint 0.022423
31 pass__inq_fi 0.023245
32 pass__inq_last_12m 0.022318
33 pass__chargeoff_within_12_mths 0.018417
34 pass__mort_acc 0.017800
35 pass__pub_rec_bankruptcies 0.020849
36 pass__tax_liens 0.016260
37 pass__tot_hi_cred_lim 0.020096
38 pass__total_bal_ex_mort 0.017674